from IPython.display import Image
Image(filename='imgs/banner.png')
%load_ext pretty_jupyter
import pendulum
Monday, 20 October 2025 at 07:55 PM (PDT)
%%html
<style>
#Styling {
font-weight: bold;
font-family: Helvetica;
}
</style>
goal¶
Notes about the notebook.
setup data¶
# dependencies
import sys
import re
import pandas as pd
from ipywidgets import HTML
sys.path.append(".")
import Summary
import Relative_risk
import Chi_square
# support methods
def format_count(v):
return "{:,}".format(v)
def format_prop(prop, decn=1, asperc=True):
if asperc: prop = prop*100
return "{}%".format(round(prop, decn))
def formatprop_num(num, den, aspercent, dec):
if num == 0: return 0
v = num/den
if not aspercent: out = round(v, dec)
else: out = round(v * 100, dec)
return out
def formatprop_str(num, den, aspercent, dec):
if num == 0: return '0'
v = num/den
if not aspercent: out = f"%.{dec}f" % v
else: out = f"%.{dec}f" % (v*100)
if int(float(out)) == 0: return 'less than 1'
return out
def formatprop(num, den, numeric, aspercent=False, dec=1):
assert (den >= num)
if numeric: out = formatprop_num(num=num, den=den, aspercent=aspercent, dec=dec)
else: out = formatprop_str(num=num, den=den, aspercent=aspercent, dec=dec)
return out
def report_fields(df, idcol, cols, fillval='NO DATA'):
data = df[[idcol] + cols].drop_duplicates()
if fillval:
count = data[cols].fillna(fillval).value_counts().to_frame().reset_index()
perc = data[cols].fillna(fillval).value_counts(normalize=True).to_frame().reset_index(
).rename(columns={'proportion': 'percent'})
else:
count = data[cols].value_counts().to_frame().reset_index()
perc = data[cols].value_counts(normalize=True).to_frame().reset_index(
).rename(columns={'proportion': 'percent'})
count['count'] = count['count'].apply(lambda x: f"{x:,}")
perc.percent = perc.percent.apply(lambda x: f"{x*100:.2f}%")
out = pd.merge(count, perc, on=cols)
return out
def get_census():
# county numbers from USCB DHC https://data.census.gov/table/DECENNIALDHC2020.P9?t=Race%20and%20Ethnicity&g=050XX00US06075
magic = {}
magic['county'] = {
'total': {'n': 873965},
'Black': {'n': 45071}, # 'Black or African American alone'
'White': {'n': 341306}, # 'White alone'
'Latine': {'n': 136761}, # 'Hispanic or Latino'
'Asian': {'n': 294220 + 3244 + 1570}, # 'Asian alone' + 'Native Hawaiian and Other Pacific Islander alone' + 'American Indian and Alaska Native alone'
'Other/Unknown': {'n': 6347 + 45446}, # 'Some Other Race alone' + 'Population of two or more races'
}
realgroups = [k for k in magic['county'].keys() if k != 'total']
assert magic['county']['total']['n'] == sum([magic['county'][group]['n'] for group in realgroups]), f"\
Census counts by race group should add up to the total population count.\
Found {magic['county']['total']['n']} for total population and {sum([magic['county'][group]['n'] for group in realgroups])} group total."
for race_group in realgroups:
magic['county'][race_group]['prop'] = formatprop(
num=magic['county'][race_group]['n'],
den=magic['county']['total']['n'],
numeric=True, aspercent=False, dec=5)
for race_group in realgroups:
magic['county'][race_group]['perc'] = formatprop(
num=magic['county'][race_group]['n'],
den=magic['county']['total']['n'],
numeric=False, aspercent=True, dec=1)
return magic
# main
"""This is the processed version of publicly available data requested by the ACLU from the San Francisco District Attorney's Office ("SFDA") covering 2015-2022. Note that the SFDA did not necessarily produce the data themselves and it may also reflect SFPD or SF Sheriff entries."""
sfda = pd.read_parquet("sfda.parquet")
magic = get_census()
# support for breaking down data
cols = {
'meta': [
'source', 'filename', 'sheet', 'incident_number', 'court_number',],
'person': [
'age_at_arrest', 'gender', 'race', 'ethnicity', 'ethnicity_group',],
'booking': [
'incident_number', 'arrest_date', 'age_at_arrest',
'booked_case_type', 'booked_charge_list',],
'filing': [
'court_number', 'filing_date',
'filed_case_type', 'filed_charge_list', 'description',
'case_dispo_date', 'case_dispo', 'dispo_description', 'dispo_description_group',
'status_ctnum', 'status_ctnum_agg', 'status_ctnum_group',],
}
grouped = [c for collist in cols.values() for c in collist]
lost = [c for c in sfda.columns if c not in grouped]
assert not any(lost), f"\
previously had grouped all available columns, however {
lost} have not been grouped."
preview data¶
Sample metadata.
| 75254 | |
|---|---|
| source | ACLU_PRA |
| filename | 2015-2022_San Francisco County_Proseuction Data (1).xlsx |
| sheet | Arrests & DA Actions|Cases Filed |
| incident_number | Z20201005-20010789 |
| court_number | 20010789 |
Sample defendant data.
| 73547 | |
|---|---|
| age_at_arrest | 27.0 |
| gender | Male |
| race | Black |
| ethnicity | Black or African American |
| ethnicity_group | Black |
Sample booking data.
| 4868 | |
|---|---|
| incident_number | Z20171212-02506701 |
| arrest_date | 2017-12-12 00:00:00 |
| age_at_arrest | NaN |
| booked_case_type | None |
| booked_charge_list | None |
Sample filing data.
| 49450 | |
|---|---|
| court_number | 18008054 |
| filing_date | NaT |
| filed_case_type | None |
| filed_charge_list | None |
| description | Assault |
| case_dispo_date | NaT |
| case_dispo | NaN |
| dispo_description | None |
| dispo_description_group | None |
| status_ctnum | None |
| status_ctnum_agg | Discharged w/o further action |
| status_ctnum_group | None |
coverage¶
This dataset includes:
- arrests (presumably by SFPD) between January 11, 2011 and December 31, 2021.
- cases filed by the SFDA between January 2, 2015 and December 30, 2021.
Note that these data are the result of a PRA request to the SFDA and so are more accurately a reflection of the cases filed between 2015-2021, which happen to cover some arrests going back to 2011.
more about the filing dates¶
| statistic | filing_date | |
|---|---|---|
| 0 | count | 46613 |
| 1 | mean | 2018-05-09 18:49:05.770493184 |
| 2 | min | 2015-01-02 00:00:00 |
| 3 | 25% | 2016-09-12 00:00:00 |
| 4 | 50% | 2018-05-21 00:00:00 |
| 5 | 75% | 2019-10-31 00:00:00 |
| 6 | max | 2021-12-30 00:00:00 |
assert sfda.filename.str.contains('San Francisco County').all()
setting up labels¶
LABELS are plain text phrases that explain what a unit of a variable represents in this context. For example, we often describe a case_filed value as 'a case against a defendant'.
These phrases are laced into the analytical template for presenting findings in plain text, so pay close attention to how the sentences read in the results to confirm it makes sense and appears correct.
# add case-based indicators
sfda['case_filed'] = sfda.court_number.notna()
sfda['conviction_any'] = sfda.dispo_description_group.isin((
'imprisonment', 'jail_probation', 'other_conviction'))
# add charge-based indicators
pcs = {
148: '148[A-Z]*|148\\.',
211: '211',
}
for code, patt in pcs.items():
for stage in ('booked', 'filed'): # note lack of a `convicted_charge_list` field
chargecol = f'{stage}_charge_list'
sfda[f'pc_{code}_{stage}'] = sfda[chargecol].str.contains(patt, na=False, flags=re.I)
sfda['booked_148_only'] = sfda.pc_148_booked & sfda.booked_charge_list.apply(
lambda x: len(x.split(',')) == 1 if x else False)
LABELS = {
'ethnicity_group': 'someone with a recorded race/ethnicity of',
'def_black': 'a Black defendant',
'def_white': 'a White defendant',
'case_filed': 'a case against a defendant',
'conviction_any': 'conviction on at least one charge', # the data are not more specific than this
'pc_148_booked': 'a booked charge for resisting arrest',
'booked_148_only': 'arrest(s) solely for resisting arrest',
'pc_148_filed': 'a filed charge for resisting arrest',
'pc_211_booked': 'a booked charge for robbery',
'pc_211_filed': 'a filed charge for robbery',
}
sample charge indicator: variation of pc_148_booked¶
I filtered for cases in which the sole booked charge was PC 148, resisting arrest.
default value_counts() presentation¶
| pc_148_booked | count | |
|---|---|---|
| 0 | False | 76840 |
| 1 | True | 8243 |
| booked_148_only | count | |
|---|---|---|
| 0 | False | 84283 |
| 1 | True | 800 |
Summary module¶
background¶
Summary.Summary.__init__
<function Summary.Summary.__init__(self, df, params, labels)>
Calculation:
- Table: df[[INDICATOR_COL, GROUP_COL]].groupby(GROUP_COL)[INDICATOR_COL].sum()
- Description: Summarize by {GROUP_COL} all records where {INDICATOR_COL} is True Present:
- Count/Percent: '{GROUP_COL.sum()/INDICATOR_COL.sum()*100}% ({GROUP_COL.sum()} of {INDICATOR_COL.sum()})'
- Finding:
- Of the {INDICATOR_COL.sum()},
- {magic['GROUP_COUNTS'][GROUP_LABEL]} were for {GROUP_LABEL}
- (repeated for each group appearing in GROUP_COL)
- Of the {INDICATOR_COL.sum()},
Summary.PARAMS
['INDICATOR_COL', 'INDICATOR_OP', 'GROUP_COL', 'RENAMER']
setup¶
pc148_sum = Summary.Summary(
df=sfda,
params={
'INDICATOR_COL': 'booked_148_only',
'INDICATOR_OP': 'are of',
'GROUP_COL': 'ethnicity_group',
'RENAMER': {
'ethnicity_group': 'Race/Ethnicity',
True: 'PC 148 alone',
False: 'Other charge(s)'
}
},
labels=LABELS,
)
text based summary: distribution across groups¶
Of the 800 arrest(s) solely for resisting arrest,
- 48 or 6.0% are of someone with a recorded race/ethnicity of Asian.
- 304 or 38.0% are of someone with a recorded race/ethnicity of Black.
- 170 or 21.2% are of someone with a recorded race/ethnicity of Latine.
- 22 or 2.8% are of someone with a recorded race/ethnicity of Other/Unknown.
- 256 or 32.0% are of someone with a recorded race/ethnicity of White.
magic numbers: distribution across groups¶
pc148_sum_gc = pc148_sum.getmagic()['GROUP_COUNTS'].copy()
pc148_sum.getmagic()
{'INDICATOR_COL': 'booked_148_only',
'INDICATOR_OP': 'are of',
'GROUP_COL': 'ethnicity_group',
'RENAMER': {'ethnicity_group': 'Race/Ethnicity',
True: 'PC 148 alone',
False: 'Other charge(s)'},
'INDICATOR_COUNT': np.int64(800),
'GROUP_COUNTS': {'Asian': 48,
'Black': 304,
'Latine': 170,
'Other/Unknown': 22,
'White': 256},
'GROUP_PERCENTS': {'Asian': 6.0,
'Black': 38.0,
'Latine': 21.25,
'Other/Unknown': 2.75,
'White': 32.0}}
table: distribution across groups¶
Of the 85,083 cases considered, there are 800 arrest(s) solely for resisting arrest, with the following distribution:
| Asian | Black | Latine | Other/Unknown | White | Total | ||
|---|---|---|---|---|---|---|---|
| 0 | Other charge(s) | 6620 (7.9%) | 30337 (36.0%) | 18386 (21.8%) | 6841 (8.1%) | 22099 (26.2%) | 84283 |
| 1 | PC 148 alone | 48 (6.0%) | 304 (38.0%) | 170 (21.2%) | 22 (2.8%) | 256 (32.0%) | 800 |
| 2 | Total | 6668 (7.8%) | 30641 (36.0%) | 18556 (21.8%) | 6863 (8.1%) | 22355 (26.3%) | 85083 |
table: distribution within groups¶
Of the 85,083 cases considered, there are 800 arrest(s) solely for resisting arrest, with the following distribution:
| Race/Ethnicity | Other charge(s) | PC 148 alone | Total | |
|---|---|---|---|---|
| 0 | Asian | 6620 (99.3%) | 48 (0.7%) | 6668 |
| 1 | Black | 30337 (99.0%) | 304 (1.0%) | 30641 |
| 2 | Latine | 18386 (99.1%) | 170 (0.9%) | 18556 |
| 3 | Other/Unknown | 6841 (99.7%) | 22 (0.3%) | 6863 |
| 4 | White | 22099 (98.9%) | 256 (1.1%) | 22355 |
| 5 | Total | 84283 (99.1%) | 800 (0.9%) | 85083 |
Relative_risk module¶
For the contingency table to work, you need to setup a supplemental variable that is a version of the race/ethnicity field for each comparison group (ie. Black vs. White).
You could avoid setting up this variable by providing a filtered df that only includes records related to the comparison, as long as your comparison only involves groups in the data (ie. not comparing to non-Black defendants when you don't have that label encoded in the group variable).
background¶
Relative_risk.Contingency.__init__
<function Relative_risk.Contingency.__init__(self, df, params, labels)>
Calculation:
- Table:
- Description: Present:
- Contingency table:
- Finding:
Relative_risk.CONTINGENCY_PARAMS
['COMPARISON_GROUP_COL', 'OUTCOME_EVENT_COL', 'GIVEN_EVENT_COL']
Relative_risk.RATIO_PARAMS
['TREAT_GROUP_COL', 'CONTROL_GROUP_COL', 'OUTCOME_EVENT_OP', 'OUTCOME_EVENT_COL', 'GIVEN_EVENT_COL']
setup¶
sfda['def_black'] = sfda.ethnicity_group == 'Black'
sfda['def_white'] = sfda.ethnicity_group == 'White'
sfda['def_black_or_hispanic'] = sfda.ethnicity_group.isin(('Black', 'Latine'))
sfda['def_not_black'] = ~sfda.def_black
# setup comparison groups based on race data
sfda.loc[(sfda.def_black) | (sfda.def_white), 'comparison_group_1'] = sfda.ethnicity_group
sfda.loc[sfda.def_black, 'comparison_group_2'] = sfda.ethnicity_group
sfda.loc[sfda.def_not_black, 'comparison_group_2'] = 'Race other than Black'
sfda.loc[sfda.def_white, 'comparison_group_3'] = sfda.ethnicity_group
sfda.loc[sfda.def_black_or_hispanic, 'comparison_group_3'] = 'Black or Hispanic'
pc148_rr_tbl = Relative_risk.Contingency(
df=sfda,
params={
'COMPARISON_GROUP_COL': 'comparison_group_1',
'OUTCOME_EVENT_COL': 'pc_148_filed',
'GIVEN_EVENT_COL': 'booked_148_only',
},
labels=LABELS,
)
pc148_rr_sum = Relative_risk.Ratio(
df=sfda,
params={
'TREAT_GROUP_COL': 'def_black',
'CONTROL_GROUP_COL': 'def_white',
'OUTCOME_EVENT_OP': 'faces',
'OUTCOME_EVENT_COL': 'pc_148_filed',
'GIVEN_EVENT_COL': 'booked_148_only',
},
labels=LABELS,
)
| a filed charge for resisting arrest | Black | White | Total | |
|---|---|---|---|---|
| 0 | True | 10 | 15 | 25 |
| 1 | False | 294 | 241 | 535 |
| 2 | Total | 304 | 256 | 560 |
The ratio of the probability that a Black defendant faces a filed charge for resisting arrest compared to the probability that a White defendant faces a filed charge for resisting arrest is 0.561.
In other words, the relative risk that a Black defendant faces a filed charge for resisting arrest is 43.9% less than a White defendant.
Chi_square module¶
This module has been setup to use the numbers exported from the Summary module.
background¶
Chi_square.DEFAULTS
{'SIG': 0.05, 'DDOF': 0}
Chi_square.CENSUS_PARAMS
['CENSUS_DICT', 'OBSERVED_DICT', 'NULL_PHRASE', 'SIG', 'DDOF']
Chi_square.EQUAL_PARAMS
['OBSERVED_DICT', 'NULL_PHRASE', 'SIG', 'DDOF']
params = {
'Census': {
'OBSERVED_DICT': pc148_sum_gc,
'CENSUS_DICT': {k:v for k,v in magic['county'].items() if k != 'total'},
'NULL_PHRASE': f"the distribution by race of {
LABELS['booked_148_only']} follows the distribution of the general population"
},
'Equal': {
'OBSERVED_DICT': pc148_sum_gc,
'NULL_PHRASE': f"the distribution by race of {
LABELS['booked_148_only']} is equal across all groups"
},
}
Comparing racial distribution to that of the census¶
This test compares whether observed racial proportions match the proportion of each racial group in the general population. In interpreting the results, a p-value below 0.05 will be considered statistically significant.
This test results in a p-value of < 0.0001, which is a statistically significant difference and rejects the null hypothesis that the distribution by race of arrest(s) solely for resisting arrest follows the distribution of the general population.
Comparing racial distribution to if each group were treated equally¶
This test compares whether observed racial proportions match the proportion of each racial group in the general population. In interpreting the results, a p-value below 0.05 will be considered statistically significant.
This test results in a p-value of < 0.0001, which is a statistically significant difference and rejects the null hypothesis that the distribution by race of arrest(s) solely for resisting arrest is equal across all groups.